In [1]:
# import libraries 
import numpy as np
import pandas as pd
import re
import os
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.utils import shuffle
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import LabelEncoder
from patsy import dmatrices
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from scipy.stats import entropy
import statsmodels.formula.api as sm
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import itertools
import inspect
import datetime
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import chi2
from scipy.stats import entropy
import datetime
import plotly.graph_objects as go
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
import seaborn as sns
import matplotlib.pyplot as plt
from dateutil.relativedelta import relativedelta
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import model_selection,ensemble,metrics
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
import joblib
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from subprocess import check_output
from sklearn import model_selection
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import chi2_contingency
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import pandas as pd
from sklearn.linear_model import LogisticRegression
from statsmodels.tools import add_constant
import statsmodels.api as sm
In [2]:
#importing data
data1=pd.read_excel(r"C:\Users\40102404\Downloads\BE\model_validation.xlsx")
# Separate the features and target variable
data = data1.drop('customer', axis=1)
X = data.drop('default', axis=1)
y = data['default']

X = add_constant(X)

# Create the logistic regression model
logit_model = sm.Logit(y, X)

# Fit the model on the data
result = logit_model.fit()

# Get the coefficients and intercept
coefficients = result.params[1:]
intercept = result.params[0]

# Create a dataframe with the coefficients and intercept
coefficients_df = pd.DataFrame({'Variable': X.columns[1:], 'Coefficient': coefficients})
coefficients_df = coefficients_df.append({'Variable': 'Intercept', 'Coefficient': intercept}, ignore_index=True)

# Add the probability of default to the dataframe
data['Prob'] = result.predict(X)

print(coefficients_df)
print(data)
Optimization terminated successfully.
         Current function value: 0.454353
         Iterations 8
    Variable  Coefficient
0        age    -0.015311
1         ed    -0.028474
2     employ    -0.225346
3    address    -0.038665
4     income    -0.002756
5    debtinc     0.076747
6   creddebt     0.557631
7    othdebt     0.026705
8  Intercept    -0.414920
     age  ed  employ  address  income  debtinc  creddebt  othdebt  default      Prob
0     28   2       7        2      44     17.7      2.99     4.80        0  0.617119
1     64   5      34       17     116     14.7      5.05    12.00        0  0.002701
2     40   1      20       12      61      4.8      1.04     1.89        0  0.005508
3     30   1      11        3      27     34.5      1.75     7.56        0  0.563058
4     25   1       2        2      30     22.4      0.76     5.96        1  0.703751
..   ...  ..     ...      ...     ...      ...       ...      ...      ...       ...
895   23   1       0        2      20      8.4      0.24     1.44        1  0.472271
896   28   3       3        3      25     18.8      1.16     3.54        1  0.597306
897   26   3       0        3      31      3.1      0.10     0.86        1  0.313629
898   36   5       3        7      45      3.3      0.47     1.02        0  0.162929
899   32   4       2        2      55     10.6      1.43     4.40        1  0.507518

[900 rows x 10 columns]
C:\Users\40102404\AppData\Local\Temp\ipykernel_10204\3736388604.py:22: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  coefficients_df = coefficients_df.append({'Variable': 'Intercept', 'Coefficient': intercept}, ignore_index=True)
In [3]:
data['prob_default']=np.where(data['Prob']>.4,1,0)
In [4]:
factor=28.8539
offset=487.123
In [5]:
data['score']=factor*np.log((1-data['Prob'])/data['Prob'])+offset
In [6]:
data['score']
Out[6]:
0      473.349942
1      657.685614
2      637.048893
3      479.806212
4      462.157872
          ...    
895    490.326679
896    475.747291
897    509.721537
898    534.345162
899    486.255272
Name: score, Length: 900, dtype: float64
In [7]:
#assigning grades
data['grade']=np.where(data['Prob']<=.0017,1,np.where(data['Prob']<.0029,2,np.where(data['Prob']<.0050,3,np.where(data['Prob']<.0088,4,np.where(data['Prob']<.0155,5,
np.where(data['Prob']<.0272,6,np.where(data['Prob']<.0476,7,np.where(data['Prob']<.0833,8,np.where(data['Prob']<.1458,9,np.where(data['Prob']<.2552,10,np.where(data['Prob']<.4466,11,np.where(data['Prob']<.7686,12,13))))))))))))
In [8]:
#AUC
AUROC = roc_auc_score(data['default'],data['Prob'])
print(" Area under ROC  :",AUROC)
 Area under ROC  : 0.8429353930341373
In [9]:
print(classification_report(data['default'],data['prob_default']))
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       571
           1       0.63      0.70      0.66       329

    accuracy                           0.74       900
   macro avg       0.72      0.73      0.73       900
weighted avg       0.75      0.74      0.74       900

In [10]:
#Gini Index
Gini = AUROC * 2 - 1
print("Gini Index  :",Gini)
Gini Index  : 0.6858707860682747
In [11]:
#Confusion Matrix
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(data['default'],data['prob_default'])
display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for  data')
display.plot(ax=ax);
In [12]:
# Calculate Somers' D
X['probability'] = result.predict(X)
concordant_pairs = ((y == 1) & (X['probability'] > 0.4)).sum()
discordant_pairs = ((y == 0) & (X['probability'] > 0.4)).sum()
total_pairs = concordant_pairs + discordant_pairs
somers_d = (concordant_pairs - discordant_pairs) / total_pairs
print("\nSomers' D:")
print(somers_d)
Somers' D:
0.25885558583106266
In [13]:
# Calculate the chi-square statistic for the decile
data2 = data1.drop('customer', axis=1)
X = data2.drop('default', axis=1)
y = data2['default']

X = add_constant(X)
# Calculate the probability values of default
X_without_intercept = X.drop('const', axis=1)
probabilities = result.predict(X)

deciles = pd.qcut(probabilities, q=10, labels=False)
decile_counts = pd.crosstab(deciles, y)
chi_square = decile_counts.apply(lambda x: (x - x.mean()) ** 2 / x.mean()).sum().sum()
print(chi_square)
306.9435055014665
In [14]:
#Cumulative Information Efficiency Ratio (CIER) 
def calculate_cier(y_true, y_pred):
    n = len(y_true)
    sorted_indices = np.argsort(y_pred)
    sorted_true = np.asarray(y_true)[sorted_indices]
    sorted_pred = np.asarray(y_pred)[sorted_indices]
    cum_sum_true = np.cumsum(sorted_true)
    cum_sum_pred = np.cumsum(sorted_pred)
    cier = np.sum(cum_sum_true / np.arange(1, n + 1)) / np.sum(cum_sum_pred / np.arange(1, n + 1))
    return cier
cier = calculate_cier(data['default'],data['Prob'])
In [15]:
# Binomial normal test
def perform_binomial_normal_test(y_true, y_pred):
    n = len(y_true)
    num_events = sum(y_true)
    p_pred = sum(y_pred) / n
    p_actual = num_events / n
    p_diff = p_pred - p_actual
    std_error = np.sqrt(p_actual * (1 - p_actual) / n)
    z_score = p_diff / std_error
    binomial_test = 2 * (1 - stats.norm.cdf(np.abs(z_score)))
    
    return binomial_test


binomial_test = perform_binomial_normal_test(data['default'],data['Prob'])
In [16]:
#hosmer lemeshow test
def perform_hosmer_lemeshow_test(y_true, y_pred, num_bins=10):
    df = pd.DataFrame({'ground_truth': y_true, 'predictions': y_pred})
    df['decile'] = pd.qcut(df['predictions'], num_bins, labels=False)
    grouped = df.groupby('decile', as_index=False)
    
    observed_events = grouped['ground_truth'].sum()
    observed_nonevents = grouped['ground_truth'].count() - observed_events
    expected_events = grouped['predictions'].sum()
    expected_nonevents = grouped['predictions'].count() - expected_events
    
    chi_square = (((observed_events - expected_events) ** 2) / expected_events).sum() + \
                 (((observed_nonevents - expected_nonevents) ** 2) / expected_nonevents).sum()
    
    degrees_of_freedom = num_bins - 2
    lm_test = 1 - chi2.cdf(chi_square, degrees_of_freedom)

    return lm_test


lm_test = perform_hosmer_lemeshow_test(data['default'],data['Prob'] )

print('lm_test:', lm_test)
lm_test: [1. 1. 1.]
In [17]:
# CAP
total = len(data['default'])
class_1_count = np.sum(data['default'])
class_0_count = total - class_1_count
plt.figure(figsize = (20, 12))

# Random Model
plt.plot([0, total], [0, class_1_count], c = 'r', linestyle = '--', label = 'Random Model')

# Perfect Model
plt.plot([0, class_1_count, total], 
         [0, class_1_count, class_1_count], 
         c = 'grey', 
         linewidth = 2, 
         label = 'Perfect Model')

# Trained Model

probs = data['Prob']
model_y = [y for _, y in sorted(zip(probs, data['default']), reverse = True)]
y_values = np.append([0], np.cumsum(model_y))
x_values = np.arange(0, total + 1)
plt.plot(x_values, 
         y_values, 
         c = 'b', 
         label = 'Credit risk model', 
         linewidth = 4)

# Plot information
plt.xlabel('Total observations', fontsize = 7)
plt.ylabel('Class 1 observations', fontsize = 7)
plt.title('Cumulative Accuracy Profile', fontsize = 7)
plt.legend(loc = 'lower right', fontsize = 7)
Out[17]:
<matplotlib.legend.Legend at 0x227736aee80>
In [18]:
# Kendalltau

res = stats.kendalltau(data['default'], data['Prob'])
res.correlation
Out[18]:
0.46738223959132313
In [19]:
#Brier score
brier_score_loss = metrics.brier_score_loss(data['default'],data['Prob'])
print("{} : {:.4f}".format("Brier Score Loss", brier_score_loss))
Brier Score Loss : 0.1527
In [20]:
#Entropy and KL divergence
def calculate_unconditional_entropy(y_true):
    true_distribution = np.histogram(y_true, bins=10, range=(0, 1), density=True)[0]
    unconditional_entropy = entropy(true_distribution)

    return unconditional_entropy

def calculate_conditional_entropy(y_true, y_pred):
    true_distribution = np.histogram(y_true, bins=10, range=(0, 1), density=True)[0]
    pred_distribution = np.histogram(y_pred, bins=10, range=(0, 1), density=True)[0]

    conditional_entropy = entropy(true_distribution, pred_distribution)

    return conditional_entropy



unconditional_entropy = calculate_unconditional_entropy(data['default'])
conditional_entropy = calculate_conditional_entropy(data['default'],data['Prob'])
KL_divergence=conditional_entropy-unconditional_entropy
In [21]:
# KS statistics 
def k_s_statistics_gain_lift(data,predicted_probability,ground_truth,response_name='default'):
    data= data.sort_values(by=predicted_probability, ascending=False)
    data['decile_group'] = pd.qcut(data[predicted_probability], q=10)
    KS_data = data.groupby('decile_group').agg( 
            [
                'count', 
                'sum', 
            ]
            )[ground_truth].sort_index(ascending=False)
    KS_data.columns = ['Total count','Number of '+response_name]
    KS_data['Number of '+'Non-'+response_name]=KS_data['Total count']-KS_data['Number of '+response_name]
    KS_data[response_name+'_Rate'+'%'] = (KS_data['Number of '+response_name] / KS_data['Total count']).apply(lambda x:round(100*x,2))
    KS_data['Percent of '+response_name+'%'] = (KS_data['Number of '+response_name]/KS_data['Number of '+response_name].sum()).apply(lambda x:round(100*x,2))
    KS_data['Percent of '+'Non-'+response_name+'%'] = (KS_data['Number of '+'Non-'+response_name]/KS_data['Number of '+'Non-'+response_name].sum()).apply(lambda x:round(100*x,2))
    KS_data['ks_stats'] = np.round(((KS_data['Number of '+response_name] / KS_data['Number of '+response_name].sum()).cumsum() -(KS_data['Number of '+'Non-'+response_name] / KS_data['Number of '+'Non-'+response_name].sum()).cumsum()), 4) * 100
    KS_data['max_ks'] = KS_data['ks_stats'].apply(lambda x: '*****' if x == KS_data['ks_stats'].max() else '')
   
    KS_data['Gain'] = KS_data['Percent of '+response_name+'%'].cumsum() 
    
    KS_data['Lift'] = (KS_data['Gain']/np.array(range(10,100+10,10))).apply(lambda x:round(x,2))     
    return KS_data
In [22]:
#KS statistics table
logistic_ks_data=k_s_statistics_gain_lift(data=data,predicted_probability='Prob',ground_truth='default')
logistic_ks_data
Out[22]:
Total count Number of default Number of Non-default default_Rate% Percent of default% Percent of Non-default% ks_stats max_ks Gain Lift
decile_group
(0.795, 1.0] 90 84 6 93.33 25.53 1.05 24.48 25.53 2.55
(0.626, 0.795] 90 60 30 66.67 18.24 5.25 37.46 43.77 2.19
(0.5, 0.626] 90 48 42 53.33 14.59 7.36 44.70 58.36 1.95
(0.408, 0.5] 90 37 53 41.11 11.25 9.28 46.66 69.61 1.74
(0.324, 0.408] 90 43 47 47.78 13.07 8.23 51.50 ***** 82.68 1.65
(0.242, 0.324] 90 24 66 26.67 7.29 11.56 47.24 89.97 1.50
(0.169, 0.242] 90 18 72 20.00 5.47 12.61 40.10 95.44 1.36
(0.0784, 0.169] 90 12 78 13.33 3.65 13.66 30.09 99.09 1.24
(0.0142, 0.0784] 90 3 87 3.33 0.91 15.24 15.76 100.00 1.11
(-0.00099818, 0.0142] 90 0 90 0.00 0.00 15.76 -0.00 100.00 1.00
In [23]:
#ploting KS statistics
def plot_cdfs_credit_risk(data, predicted_probability, ground_truth, response_name='default'):
    # Sort the data by predicted probability
    sorted_data = data.sort_values(predicted_probability)

    # Separate the data into default and non-default groups
    default_data = sorted_data[sorted_data[ground_truth] == 1]
    non_default_data = sorted_data[sorted_data[ground_truth] == 0]

    # Calculate the cumulative probabilities
    default_cdf = np.cumsum(default_data[predicted_probability]) / np.sum(default_data[predicted_probability])
    non_default_cdf = np.cumsum(non_default_data[predicted_probability]) / np.sum(non_default_data[predicted_probability])

    # Plot the CDFs
    plt.plot(default_data[predicted_probability], default_cdf, label=response_name)
    plt.plot(non_default_data[predicted_probability], non_default_cdf, label='Non-' + response_name)
    plt.xlabel('Predicted Probability')
    plt.ylabel('Cumulative Probability')
    plt.title('CDFs in Credit Risk')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Calculate the KS statistic
    ks_statistic = np.max(np.abs(default_cdf - non_default_cdf))

    return ks_statistic


ks_statistic = plot_cdfs_credit_risk(data=data,predicted_probability='Prob',ground_truth='default')
In [24]:
#gain Chart

def model_selection_by_gain_chart(model_gains_dict):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(0,100+10,10)), y=list(range(0,100+10,10)),
                    mode='lines+markers',name='Random Model'))
    for model_name,model_gains in model_gains_dict.items():
        model_gains.insert(0,0)
        fig.add_trace(go.Scatter(x=list(range(0,100+10,10)), y=model_gains,
                    mode='lines+markers',name=model_name))
    fig.update_xaxes(
        title_text = "% of Data Set",)

    fig.update_yaxes(title_text = "% of Gain",)
    fig.update_layout(title='Gain Charts',)
    fig.show()
model_selection_by_gain_chart(model_gains_dict={'Credit risk model':logistic_ks_data.Gain.to_list()})  
In [25]:
# lift chart
def model_selection_by_lift_chart(model_lift_dict):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(10,100+10,10)), y=np.repeat(1,10),
                    mode='lines+markers',name='Random Lift'))
    for model_name,model_lifts in model_lift_dict.items():
        fig.add_trace(go.Scatter(x=list(range(10,100+10,10)), y=model_lifts,
                    mode='lines+markers',name=model_name))
    fig.update_xaxes(
        title_text = "% of Data Set",)

    fig.update_yaxes(title_text = "Lift",)
    fig.update_layout(title='Lift Charts',)
    fig.show()
model_selection_by_lift_chart(model_lift_dict={'Credit Risk Model':logistic_ks_data.Lift.to_list()})
In [26]:
# Cohen's d statistic 
def calculate_cohens_d(y_true, y_pred):
    mean_diff = np.mean(y_true) - np.mean(y_pred)
    pooled_std = np.sqrt((np.var(y_true) + np.var(y_pred)) / 2)
    cohens_d = mean_diff / pooled_std
    
    return cohens_d

cohens_d = calculate_cohens_d(data['default'],data['Prob'])

print("Cohen's d:", cohens_d)
Cohen's d: 0.0
In [27]:
#importing new datasets
psi=pd.read_excel(r"C:\Users\40102404\Downloads\BE\PSI.xlsx")
In [28]:
psi
Out[28]:
customer age ed employ address income debtinc creddebt othdebt
0 360036 24 4 0 1 56 9.3 1.55 3.66
1 360188 19 2 0 0 55 7.9 0.61 3.73
2 360230 28 4 2 3 22 6.2 0.76 0.61
3 360245 40 5 1 11 32 18.6 3.40 2.55
4 360349 33 5 6 4 121 15.1 3.31 14.96
... ... ... ... ... ... ... ... ... ...
595 453471 34 3 8 4 83 11.0 1.85 7.28
596 453578 37 2 10 8 43 3.6 0.81 0.74
597 453686 25 5 0 3 16 3.2 0.29 0.22
598 453698 34 1 10 8 41 14.5 1.19 4.75
599 453777 27 1 2 2 24 5.8 0.56 0.84

600 rows × 9 columns

In [29]:
#calculating score and probablity of new dataset
psi = psi.drop('customer', axis=1)
psi = add_constant(psi)
psi['Prob']=result.predict(psi)
psi['score']=factor*np.log((1-psi['Prob'])/psi['Prob'])+offset
In [30]:
#assigning grade to new dataset
psi['grade']=np.where(psi['Prob']<=.0017,1,np.where(psi['Prob']<.0029,2,np.where(psi['Prob']<.0050,3,np.where(psi['Prob']<.0088,4,np.where(psi['Prob']<.0155,5,
np.where(psi['Prob']<.0272,6,np.where(psi['Prob']<.0476,7,np.where(psi['Prob']<.0833,8,np.where(psi['Prob']<.1458,9,np.where(psi['Prob']<.2552,10,np.where(psi['Prob']<.4466,11,np.where(psi['Prob']<.7686,12,13))))))))))))
In [31]:
psi
Out[31]:
const age ed employ address income debtinc creddebt othdebt Prob score grade
0 1.0 24 4 0 1 56 9.3 1.55 3.66 0.642572 470.198938 12
1 1.0 19 2 0 0 55 7.9 0.61 3.73 0.532882 483.322444 12
2 1.0 28 4 2 3 22 6.2 0.76 0.61 0.338740 506.423802 11
3 1.0 40 5 1 11 32 18.6 3.40 2.55 0.815019 444.333841 13
4 1.0 33 5 6 4 121 15.1 3.31 14.96 0.622788 472.655720 12
... ... ... ... ... ... ... ... ... ... ... ... ...
595 1.0 34 3 8 4 83 11.0 1.85 7.28 0.242903 519.924958 10
596 1.0 37 2 10 8 43 3.6 0.81 0.74 0.048711 572.874377 8
597 1.0 25 5 0 3 16 3.2 0.29 0.22 0.334707 506.944803 11
598 1.0 34 1 10 8 41 14.5 1.19 4.75 0.149761 537.227016 10
599 1.0 27 1 2 2 24 5.8 0.56 0.84 0.338248 506.487185 11

600 rows × 12 columns

In [32]:
#calculation PSI values across different grade
dev = data['grade'].value_counts(normalize=True)
new = psi['grade'].value_counts(normalize=True)
df = pd.concat([dev, new], axis=1, keys=['dev', 'new'])
df = df.sort_index()
df['PSI'] = (df['dev']-df['new']) * np.log(df['dev'] / df['new'])
In [33]:
df['PSI']
Out[33]:
1     0.000153
2     0.011297
3     0.000927
4     0.002839
5     0.000569
6     0.001808
7     0.000194
8     0.000567
9     0.000004
10    0.000080
11    0.001446
12    0.001425
13    0.006545
Name: PSI, dtype: float64
In [34]:
#total PSI value
np.sum(df['PSI'])
Out[34]:
0.027854160655260505